{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Historical Feature Statistics with Feast, TFDV and Facets\n",
    "\n",
    "This tutorial covers how Feast can be used in conjunction with TFDV and Facets to retrieve statistics about feature datasets. \n",
    "\n",
    "The notebook showcases how Feast's integration with TFDV allows users to:\n",
    "\n",
    "1. Define TFX feature schemas and persist these properties in the Feature Store\n",
    "2. Validate new data against the defined schema\n",
    "3. Validate data already in Feast against the defined schema\n",
    "\n",
    "**Prerequisites**:\n",
    "\n",
    "- Feast running with at least one BigQuery warehouse store. This example uses a BigQuery store named `historical`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "setting project to statistics...\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import pytest\n",
    "import pytz\n",
    "import uuid\n",
    "import time\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "from feast.client import Client\n",
    "from feast.entity import Entity\n",
    "from feast.feature import Feature\n",
    "from feast.feature_set import FeatureSet\n",
    "from feast.type_map import ValueType\n",
    "from google.protobuf import json_format\n",
    "from google.protobuf.duration_pb2 import Duration\n",
    "from tensorflow_metadata.proto.v0 import statistics_pb2\n",
    "from tensorflow_metadata.proto.v0 import schema_pb2\n",
    "import tensorflow_data_validation as tfdv\n",
    "\n",
    "# Feast project that every feature set in this notebook is registered under.\n",
    "PROJECT_NAME = \"statistics\"\n",
    "# Download the dataset over HTTPS rather than plain HTTP so it cannot be altered in transit.\n",
    "IRIS_DATASET = \"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\"\n",
    "# Name of the BigQuery store (see prerequisites) over which statistics are computed.\n",
    "BIGQUERY_STORE_NAME = \"historical\"\n",
    "\n",
    "# Connect to Feast Core and scope all subsequent client calls to the project.\n",
    "client = Client(core_url=\"localhost:6565\")\n",
    "print(f\"setting project to {PROJECT_NAME}...\")\n",
    "client.set_project(PROJECT_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this example, we are using the iris dataset. More information about this dataset can be found [here](http://archive.ics.uci.edu/ml/datasets/iris)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal_length</th>\n",
       "      <th>sepal_width</th>\n",
       "      <th>petal_length</th>\n",
       "      <th>petal_width</th>\n",
       "      <th>class</th>\n",
       "      <th>datetime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "      <td>2020-05-25 07:31:28.230582+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "      <td>2020-05-25 07:31:28.230582+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "      <td>2020-05-25 07:31:28.230582+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "      <td>2020-05-25 07:31:28.230582+00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "      <td>2020-05-25 07:31:28.230582+00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sepal_length  sepal_width  petal_length  petal_width        class  \\\n",
       "0           5.1          3.5           1.4          0.2  Iris-setosa   \n",
       "1           4.9          3.0           1.4          0.2  Iris-setosa   \n",
       "2           4.7          3.2           1.3          0.2  Iris-setosa   \n",
       "3           4.6          3.1           1.5          0.2  Iris-setosa   \n",
       "4           5.0          3.6           1.4          0.2  Iris-setosa   \n",
       "\n",
       "                          datetime  \n",
       "0 2020-05-25 07:31:28.230582+00:00  \n",
       "1 2020-05-25 07:31:28.230582+00:00  \n",
       "2 2020-05-25 07:31:28.230582+00:00  \n",
       "3 2020-05-25 07:31:28.230582+00:00  \n",
       "4 2020-05-25 07:31:28.230582+00:00  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris_feature_names = [\"sepal_length\", \"sepal_width\", \"petal_length\", \"petal_width\"]\n",
    "# Column names are supplied explicitly: the four features followed by the class label.\n",
    "df = pd.read_csv(IRIS_DATASET, names=iris_feature_names + [\"class\"])\n",
    "\n",
    "# Add a `datetime` column to satisfy Feast; every row is stamped one day in the past.\n",
    "# datetime.now(tz) yields a timezone-aware UTC timestamp directly, avoiding the naive\n",
    "# datetime.utcnow(), which is deprecated since Python 3.12.\n",
    "current_datetime = datetime.now(pytz.utc)\n",
    "df['datetime'] = current_datetime - timedelta(days=1)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFDV schema as part of the feature set definition\n",
    "\n",
    "An integral part of TFDV is the feature [schemas](https://github.com/tensorflow/metadata/blob/master/tensorflow_metadata/proto/v0/schema.proto) that describe the expected properties of the data in a dataset, such as:\n",
    "- expected feature presence\n",
    "- type\n",
    "- expected domains of features\n",
    "\n",
    "These schemas, which can be [manually defined or generated by TFDV](https://www.tensorflow.org/tfx/data_validation/get_started#inferring_a_schema_over_the_data), can then be used to extend the definition of features within the feature set. As part of the spec, the schema is persisted within Feast and is used for both in-flight data validation and offline integration with TFDV.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:Ignoring feature datetime of type datetime64[ns, UTC]\n",
      "/Users/zhiling/.pyenv/versions/3.7.2/envs/test-feast/lib/python3.7/site-packages/tensorflow_data_validation/arrow/arrow_util.py:236: FutureWarning: Calling .data on ChunkedArray is provided for compatibility after Column was removed, simply drop this attribute\n",
      "  types.FeaturePath([column_name]), column.data.chunk(0), weights):\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entity class(ValueType.STRING) manually updated (replacing an existing field).\n",
      "Feature sepal_length (ValueType.DOUBLE) added from dataframe.\n",
      "Feature sepal_width (ValueType.DOUBLE) added from dataframe.\n",
      "Feature petal_length (ValueType.DOUBLE) added from dataframe.\n",
      "Feature petal_width (ValueType.DOUBLE) added from dataframe.\n",
      "\n",
      "{\n",
      "  \"spec\": {\n",
      "    \"name\": \"iris\",\n",
      "    \"entities\": [\n",
      "      {\n",
      "        \"name\": \"class\",\n",
      "        \"valueType\": \"STRING\"\n",
      "      }\n",
      "    ],\n",
      "    \"features\": [\n",
      "      {\n",
      "        \"name\": \"sepal_length\",\n",
      "        \"valueType\": \"DOUBLE\",\n",
      "        \"presence\": {\n",
      "          \"minFraction\": 1.0,\n",
      "          \"minCount\": \"1\"\n",
      "        },\n",
      "        \"shape\": {\n",
      "          \"dim\": [\n",
      "            {\n",
      "              \"size\": \"1\"\n",
      "            }\n",
      "          ]\n",
      "        }\n",
      "      },\n",
      "      {\n",
      "        \"name\": \"sepal_width\",\n",
      "        \"valueType\": \"DOUBLE\",\n",
      "        \"presence\": {\n",
      "          \"minFraction\": 1.0,\n",
      "          \"minCount\": \"1\"\n",
      "        },\n",
      "        \"shape\": {\n",
      "          \"dim\": [\n",
      "            {\n",
      "              \"size\": \"1\"\n",
      "            }\n",
      "          ]\n",
      "        }\n",
      "      },\n",
      "      {\n",
      "        \"name\": \"petal_length\",\n",
      "        \"valueType\": \"DOUBLE\",\n",
      "        \"presence\": {\n",
      "          \"minFraction\": 1.0,\n",
      "          \"minCount\": \"1\"\n",
      "        },\n",
      "        \"shape\": {\n",
      "          \"dim\": [\n",
      "            {\n",
      "              \"size\": \"1\"\n",
      "            }\n",
      "          ]\n",
      "        }\n",
      "      },\n",
      "      {\n",
      "        \"name\": \"petal_width\",\n",
      "        \"valueType\": \"DOUBLE\",\n",
      "        \"presence\": {\n",
      "          \"minFraction\": 1.0,\n",
      "          \"minCount\": \"1\"\n",
      "        },\n",
      "        \"shape\": {\n",
      "          \"dim\": [\n",
      "            {\n",
      "              \"size\": \"1\"\n",
      "            }\n",
      "          ]\n",
      "        },\n",
      "        \"floatDomain\": {\n",
      "          \"min\": 0.0\n",
      "        }\n",
      "      }\n",
      "    ]\n",
      "  },\n",
      "  \"meta\": {}\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Let TFDV infer a baseline schema from the iris dataframe. The inferred values can be tweaked as necessary.\n",
    "stats = tfdv.generate_statistics_from_dataframe(df)\n",
    "schema = tfdv.infer_schema(statistics=stats)\n",
    "\n",
    "# Tighten the inferred schema: petal_width must not fall below 0.\n",
    "tfdv.set_domain(schema, 'petal_width', schema_pb2.FloatDomain(min=0))\n",
    "\n",
    "# Create a new FeatureSet, inferring its features from the dataframe columns\n",
    "# and declaring `class` explicitly as the entity.\n",
    "class_entity = Entity(name=\"class\", dtype=ValueType.STRING)\n",
    "feature_columns_df = df[[\"datetime\", *iris_feature_names]]\n",
    "feature_set = FeatureSet(name=\"iris\")\n",
    "feature_set.infer_fields_from_df(feature_columns_df, entities=[class_entity])\n",
    "\n",
    "# Apply the constraints defined in the schema to the feature set's entities and features\n",
    "feature_set.import_tfx_schema(schema)\n",
    "print(feature_set)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Computing statistics over an ingested dataset\n",
    "\n",
    "Feast is able to compute statistics for any data that has been ingested into the system. Statistics can be computed over either discrete datasets using *dataset_ids* or periods of time using a specified time range.\n",
    "\n",
    "These statistics are computed at a historical store (caveat: only BQ is supported at the moment). The feature statistics are returned in the form of TFX's `DatasetFeatureStatisticsList`, which can then be fed directly back into TFDV methods to either visualise the data statistics or validate the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature set created: \"iris\"\n",
      "Waiting for feature set to be ready for ingestion...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 150/150 [00:01<00:00, 122.33rows/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ingestion complete!\n",
      "\n",
      "Ingestion statistics:\n",
      "Success: 150/150\n",
      "Removing temporary file(s)...\n",
      "\n",
      "ingestion id: 73ed84b1-1218-3702-b4c6-673503233264\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Register the feature set with Feast\n",
    "client.apply(feature_set)\n",
    "\n",
    "# Ingesting a dataset into Feast returns a unique ingestion id that references the ingested dataset.\n",
    "ingestion_id = client.ingest(feature_set, df)\n",
    "print(f\"\\ningestion id: {ingestion_id}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give the ingested rows time to reach the store before querying it, mirroring the\n",
    "# wait used after the second ingestion later in this notebook. Without it, running\n",
    "# all cells back-to-back can query the store before the data has landed.\n",
    "time.sleep(10) # Sleep is not necessary if not using DirectRunner\n",
    "\n",
    "# Get statistics from Feast for the ingested dataset.\n",
    "# The statistics are calculated over the data in the store specified,\n",
    "# restricted to the rows belonging to the given ingestion id.\n",
    "stats = client.get_statistics(\n",
    "    feature_set_id='iris', \n",
    "    store=BIGQUERY_STORE_NAME, \n",
    "    features=iris_feature_names, \n",
    "    ingestion_ids=[ingestion_id])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualising statistics with Facets\n",
    "\n",
    "Since Feast outputs statistics in a format compatible with the TFDV API, the stats object can be directly passed to `tfdv.visualize_statistics()` to visualise, in-line, the output statistics on [Facets](https://pair-code.github.io/facets/), allowing for easy and interactive exploration of the shape and distribution of the data inside Feast."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<iframe id='facets-iframe' width=\"100%\" height=\"500px\"></iframe>\n",
       "        <script>\n",
       "        facets_iframe = document.getElementById('facets-iframe');\n",
       "        facets_html = '<script src=\"https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js\"><\\/script><link rel=\"import\" href=\"https://raw.githubusercontent.com/PAIR-code/facets/master/facets-dist/facets-jupyter.html\"><facets-overview proto-input=\"CsoVCg5saHNfc3RhdGlzdGljcxB8GoAFEAEa6wQKDQh8GAEgAS0AAIA/QHwRdTtUt0P1EEAZQU5JByHi+D8pMzMzMzMz8z8xZmZmZmZmEkA5mpmZmZmZG0BChQIaGwnYo3A9CtcLQBE0MzMzMzMQQCEAAAAAAAAqQBobCTQzMzMzMxBAEXsUrkfhehJAIQAAAAAAADhAGhsJw/UoXI/CFEARC9ejcD0KF0AhAAAAAAAAMkAaGwkL16NwPQoXQBFTuB6F61EZQCEAAAAAAAAmQBobCXsUrkfhehJAEcP1KFyPwhRAIQAAAAAAADpAGhsJMzMzMzMz8z8RUrgehetR/D8hAAAAAAAAN0AaGwlTuB6F61EZQBGbmZmZmZkbQCEAAAAAAAAUQBobCUjhehSuRwdAEdijcD0K1wtAIQAAAAAAAAhAGhsJUrgehetR/D8RuB6F61G4AkAhAAAAAAAA8D9CpAIaGwkzMzMzMzPzPxEAAAAAAAD4PyEAAAAAAAAoQBobCQAAAAAAAPg/EQAAAAAAAAhAIQAAAAAAAChAGhsJAAAAAAAACEARAAAAAAAAEEAhAAAAAAAAKEAaGwkAAAAAAAAQQBGamZmZmZkRQCEAAAAAAAAoQBobCZqZmZmZmRFAEWZmZmZmZhJAIQAAAAAAAChAGhsJZmZmZmZmEkARmpmZmZmZE0AhAAAAAAAAKEAaGwmamZmZmZkTQBFmZmZmZmYUQCEAAAAAAAAoQBobCWZmZmZmZhRAEWZmZmZmZhZAIQAAAAAAAChAGhsJZmZmZmZmFkARmpmZmZmZF0AhAAAAAAAAKEAaGwmamZmZmZkXQBGamZmZmZkbQCEAAAAAAAAoQCABQg4KDHBldGFsX2xlbmd0aBq6BRABGqUFCg0IfBgBIAEtAACAP0B8EQ3V7VDdDhhAGSSMNA98YOk/KZqZmZmZmRFAMQAAAAAAABhAOZqZmZmZmR9AQr8CGhsJmpmZmZmZGEARAAAAAAAAGkAhAAAAAAAANEAaGwk0MzMzMzMXQBGamZmZmZkYQCEAAAAAAAAuQBobCQAAAAAAABpAEWZmZmZmZhtAIQAAAAAAADJAGhsJzczMzMzMFUARNDMzMzMzF0AhAAAAAAAAOUAaGwnNzMzMzMwcQBE0MzMzMzMeQCEAAAAAAAAUQBobCQAAAAAAABNAEWdmZmZmZhRAIQAAAAAAADFAGhsJZmZmZmZmG0ARzczMzMzMHEAhAAAAAAAAGEAaGwmamZmZmZkRQBEAAAAAAAATQCEAAAAAAAAUQBobCWdmZmZmZhRAEc3MzMzMzBVAIQAAAAAAABxAGhsJNDMzMzMzHkARmpmZmZmZH0AhAAAAAAAAFEAaGwmamZmZmZkfQBEAAAAAAIAgQCEAAAAAAADwP0KkAhobCZqZmZmZmRFAEQAAAAAAABRAIQAAAAAAAChAGhsJAAAAAAAAFEARzczMzMzMFEAhAAAAAAAAKEAaGwnNzMzMzMwUQBFmZmZmZmYWQCEAAAAAAAAoQBobCWZmZmZmZhZAETMzMzMzMxdAIQAAAAAAAChAGhsJMzMzMzMzF0ARAAAAAAAAGEAhAAAAAAAAKEAaGwkAAAAAAAAYQBEzMzMzMzMZQCEAAAAAAAAoQBobCTMzMzMzMxlAEZqZmZmZmRlAIQAAAAAAAChAGhsJmpmZmZmZGUARzczMzMzMGkAh
AAAAAAAAKEAaGwnNzMzMzMwaQBEAAAAAAAAcQCEAAAAAAAAoQBobCQAAAAAAABxAEZqZmZmZmR9AIQAAAAAAAChAIAFCDgoMc2VwYWxfbGVuZ3RoGrkFEAEapQUKDQh8GAEgAS0AAIA/QHwRwGT5S5a/B0AZ1hJ++kBb2T8pAAAAAAAAAEAxAAAAAAAACEA5zczMzMzMEEBCvwIaGwkK16NwPQoHQBHNzMzMzMwIQCEAAAAAAAA/QBobCZDC9ShcjwpAEVK4HoXrUQxAIQAAAAAAACZAGhsJzczMzMzMCEARkML1KFyPCkAhAAAAAAAAO0AaGwnD9Shcj8IBQBGF61G4HoUDQCEAAAAAAAAcQBobCUjhehSuRwVAEQrXo3A9CgdAIQAAAAAAADdAGhsJhetRuB6FA0ARSOF6FK5HBUAhAAAAAAAAKkAaGwkAAAAAAAAAQBHD9Shcj8IBQCEAAAAAAAAQQBobCRWuR+F6FA5AEdijcD0K1w9AIQAAAAAAABBAGhsJUrgehetRDEARFa5H4XoUDkAhAAAAAAAAAEAaGwnYo3A9CtcPQBHNzMzMzMwQQCEAAAAAAADwPxobCc3MzMzMzBBAEa5H4XoUrhFAIQAAAAAAAPA/QqQCGhsJAAAAAAAAAEARAAAAAAAABEAhAAAAAAAAKEAaGwkAAAAAAAAEQBGamZmZmZkFQCEAAAAAAAAoQBobCZqZmZmZmQVAEWZmZmZmZgZAIQAAAAAAAChAGhsJZmZmZmZmBkARMzMzMzMzB0AhAAAAAAAAKEAaGwkzMzMzMzMHQBEAAAAAAAAIQCEAAAAAAAAoQBobCQAAAAAAAAhAEQAAAAAAAAhAIQAAAAAAAChAGhsJAAAAAAAACEARzczMzMzMCEAhAAAAAAAAKEAaGwnNzMzMzMwIQBGamZmZmZkJQCEAAAAAAAAoQBobCZqZmZmZmQlAETMzMzMzMwtAIQAAAAAAAChAGhsJMzMzMzMzC0ARzczMzMzMEEAhAAAAAAAAKEAgAUINCgtzZXBhbF93aWR0aBq5BRABGqUFCg0IfBgBIAEtAACAP0B8EWGy/CXLX/Y/GYUfa15j+OU/KZqZmZmZmbk/MQAAAAAAAPg/OQAAAAAAAARAQr8CGhsJAAAAAAAABEAR61G4HoXrBUAhAAAAAAAACEAaGwnNzMzMzMz0PxGkcD0K16P4PyEAAAAAAIBAQBobCT0K16NwPeo/EfYoXI/C9fA/IQAAAAAAABxAGhsJ9ihcj8L18D8RzczMzMzM9D8hAAAAAAAAIEAaGwmkcD0K16P4PxF7FK5H4Xr8PyEAAAAAAAAYQBobCZqZmZmZmbk/EcL1KFyPwtU/IQAAAAAAADRAGhsJKVyPwvUoAEARFa5H4XoUAkAhAAAAAAAAIkAaGwl7FK5H4Xr8PxEpXI/C9SgAQCEAAAAAAAA3QBobCRWuR+F6FAJAEQAAAAAAAARAIQAAAAAAACZAGhsJwvUoXI/C1T8Rj8L1KFyP4j8hAAAAAAAACEAaGwmPwvUoXI/iPxE9CtejcD3qPyEAAAAAAADwP0KkAhobCZqZmZmZmbk/EZqZmZmZmck/IQAAAAAAAChAGhsJmpmZmZmZyT8RAAAAAAAA8D8hAAAAAAAAKEAaGwkAAAAAAADwPxEzMzMzMzPzPyEAAAAAAAAoQBobCTMzMzMzM/M/Ec3MzMzMzPQ/IQAAAAAAAChAGhsJzczMzMzM9D8RAAAAAAAA+D8hAAAAAAAAKEAaGwkAAAAAAAD4PxGamZmZmZn5PyEAAAAAAAAoQBobCZqZmZmZmfk/Ec3MzMzMzPw/IQAAAAAAAChAGhsJzczMzMzM/D8RAAAAAAAAAEAhAAAAAAAAKEAaGwkAAAAAAAAAQBFmZmZmZmYCQCEAAAAAAAAoQBobCWZmZmZmZgJAEQAAAAAAAARAIQAAAAAAAChAIAFCDQoLcGV0YWxfd2lkdGg=\"></facets-overview
>';\n",
       "        facets_iframe.srcdoc = facets_html;\n",
       "         facets_iframe.id = \"\";\n",
       "         setTimeout(() => {\n",
       "           facets_iframe.setAttribute('height', facets_iframe.contentWindow.document.body.offsetHeight + 'px')\n",
       "         }, 1500)\n",
       "         </script>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Render the statistics returned by Feast inline using Facets\n",
    "tfdv.visualize_statistics(lhs_statistics=stats)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validating correctness of subsequent datasets\n",
    "\n",
    "While it is useful to explore dataset statistics using Facets, we have already defined a schema that specifies a dataset's bounds of correctness, so we can leverage TFDV's `validate_statistics` to check whether subsequent datasets are problematic."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It is possible to validate correctness of a new dataset prior to ingestion by retrieving the schema from the feature set, and comparing computed statistics against that schema. \n",
    "\n",
    "This can be useful if we want to avoid ingesting problematic data into Feast."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:Ignoring feature datetime of type datetime64[ns, UTC]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Anomaly short description</th>\n",
       "      <th>Anomaly long description</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Feature name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>'petal_width'</th>\n",
       "      <td>Out-of-range values</td>\n",
       "      <td>Unexpectedly low values: -1&lt;-1(upto six significant digits)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>'class'</th>\n",
       "      <td>New column</td>\n",
       "      <td>New column (column in data but not in schema)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              Anomaly short description  \\\n",
       "Feature name                              \n",
       "'petal_width'  Out-of-range values        \n",
       "'class'        New column                 \n",
       "\n",
       "                                                  Anomaly long description  \n",
       "Feature name                                                                \n",
       "'petal_width'  Unexpectedly low values: -1<-1(upto six significant digits)  \n",
       "'class'        New column (column in data but not in schema)                "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Build a new dataset with obviously incorrect data (an unknown class and a negative petal width)\n",
    "bad_rows = {\n",
    "    \"datetime\": current_datetime,\n",
    "    \"class\": [\"Iris-setosa\", \"Iris-virginica\", \"Iris-nonsensica\"],\n",
    "    \"sepal_length\": [4.3, 6.9, 12],\n",
    "    \"sepal_width\": [3.0, 2.8, 1.1],\n",
    "    \"petal_length\": [1.2, 4.9, 2.2],\n",
    "    \"petal_width\": [0.1, 1.8, -1.0],\n",
    "}\n",
    "df_2 = pd.DataFrame(bad_rows)\n",
    "\n",
    "# Compute statistics locally and check them against the schema held by the feature set\n",
    "stats_2 = tfdv.generate_statistics_from_dataframe(df_2)\n",
    "iris_schema = feature_set.export_tfx_schema()\n",
    "anomalies = tfdv.validate_statistics(statistics=stats_2, schema=iris_schema)\n",
    "tfdv.display_anomalies(anomalies)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Alternatively, the data can be ingested into Feast, and the statistics computed at the store. This has the benefit of offloading statistics computation for large datasets to Feast."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  0%|          | 0/3 [00:00<?, ?rows/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Waiting for feature set to be ready for ingestion...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3/3 [00:01<00:00,  2.85rows/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ingestion complete!\n",
      "\n",
      "Ingestion statistics:\n",
      "Success: 3/3\n",
      "Removing temporary file(s)...\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Anomaly short description</th>\n",
       "      <th>Anomaly long description</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Feature name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>'petal_width'</th>\n",
       "      <td>Out-of-range values</td>\n",
       "      <td>Unexpectedly low values: -1&lt;-1(upto six significant digits)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              Anomaly short description  \\\n",
       "Feature name                              \n",
       "'petal_width'  Out-of-range values        \n",
       "\n",
       "                                                  Anomaly long description  \n",
       "Feature name                                                                \n",
       "'petal_width'  Unexpectedly low values: -1<-1(upto six significant digits)  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Push the new dataset into Feast, keeping the id of this ingestion\n",
    "ingestion_id_2 = client.ingest(feature_set, df_2)\n",
    "time.sleep(10) # Sleep is not necessary if not using DirectRunner\n",
    "\n",
    "# Have Feast compute statistics over the rows of this ingestion only\n",
    "stats_request = dict(\n",
    "    feature_set_id='iris',\n",
    "    store=BIGQUERY_STORE_NAME,\n",
    "    features=iris_feature_names,\n",
    "    ingestion_ids=[ingestion_id_2],\n",
    ")\n",
    "stats_2 = client.get_statistics(**stats_request)\n",
    "\n",
    "# Check the store-computed statistics against the feature set's schema and show any anomalies\n",
    "iris_schema = feature_set.export_tfx_schema()\n",
    "anomalies = tfdv.validate_statistics(statistics=stats_2, schema=iris_schema)\n",
    "tfdv.display_anomalies(anomalies)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}